{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "import numpy as np\n",
    "import lightgbm as lgb\n",
    "from sklearn.preprocessing import LabelEncoder\n",
    "from sklearn.model_selection import KFold\n",
    "import datetime\n",
    "import gc"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "df_train= pd.read_csv('../../Large_output/train_clean_merge.csv')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [],
   "source": [
    "from pandas.api.types import is_datetime64_any_dtype as is_datetime\n",
    "from pandas.api.types import is_categorical_dtype\n",
    "\n",
    "def reduce_mem_usage(df, use_float16=False):\n",
    "    \"\"\"\n",
    "    Iterate through all the columns of a dataframe and modify the data type to reduce memory usage.        \n",
    "    \"\"\"\n",
    "    \n",
    "    start_mem = df.memory_usage().sum() / 1024**2\n",
    "    print(\"Memory usage of dataframe is {:.2f} MB\".format(start_mem))\n",
    "    \n",
    "    for col in df.columns:\n",
    "        if is_datetime(df[col]) or is_categorical_dtype(df[col]):\n",
    "            continue\n",
    "        col_type = df[col].dtype\n",
    "        \n",
    "        if col_type != object:\n",
    "            c_min = df[col].min()\n",
    "            c_max = df[col].max()\n",
    "            if str(col_type)[:3] == \"int\":\n",
    "                if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:\n",
    "                    df[col] = df[col].astype(np.int8)\n",
    "                elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max:\n",
    "                    df[col] = df[col].astype(np.int16)\n",
    "                elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max:\n",
    "                    df[col] = df[col].astype(np.int32)\n",
    "                elif c_min > np.iinfo(np.int64).min and c_max < np.iinfo(np.int64).max:\n",
    "                    df[col] = df[col].astype(np.int64)  \n",
    "            else:\n",
    "                if use_float16 and c_min > np.finfo(np.float16).min and c_max < np.finfo(np.float16).max:\n",
    "                    df[col] = df[col].astype(np.float16)\n",
    "                elif c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:\n",
    "                    df[col] = df[col].astype(np.float32)\n",
    "                else:\n",
    "                    df[col] = df[col].astype(np.float64)\n",
    "        else:\n",
    "            df[col] = df[col].astype(\"category\")\n",
    "\n",
    "    end_mem = df.memory_usage().sum() / 1024**2\n",
    "    print(\"Memory usage after optimization is: {:.2f} MB\".format(end_mem))\n",
    "    print(\"Decreased by {:.1f}%\".format(100 * (start_mem - end_mem) / start_mem))\n",
    "    \n",
    "    return df"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [],
   "source": [
    "def features_engineering(df):\n",
    "    \n",
    "    # Sort by localtime\n",
    "    df.sort_values(\"local_time\")\n",
    "    df.reset_index(drop=True)\n",
    "    \n",
    "    # Add more features\n",
    "    df[\"local_time\"] = pd.to_datetime(df[\"local_time\"],format=\"%Y-%m-%d %H:%M:%S\")\n",
    "    df[\"hour\"] = df[\"local_time\"].dt.hour\n",
    "    df[\"weekend\"] = df[\"local_time\"].dt.weekday\n",
    "    df['square_feet'] =  np.log1p(df['square_feet'])\n",
    "    \n",
    "    \n",
    "    # Encode Categorical Data\n",
    "    le = LabelEncoder()\n",
    "    df[\"primary_use\"] = le.fit_transform(df[\"primary_use\"])\n",
    "    \n",
    "    return df"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Memory usage of dataframe is 2638.86 MB\n",
      "Memory usage after optimization is: 733.78 MB\n",
      "Decreased by 72.2%\n"
     ]
    }
   ],
   "source": [
    "df_train = reduce_mem_usage(df_train,use_float16=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [],
   "source": [
    "df_train['local_time'] = pd.to_datetime(df_train['local_time'])\n",
    "df_train['month'] = df_train['local_time'].dt.month\n",
    "df_train['group'] = df_train['month']\n",
    "df_train['group'].replace((1,2,3,4,5,6), 1,inplace=True)\n",
    "df_train['group'].replace((7,8,9,10,11,12), 2, inplace=True)\n",
    "df_train['group'].value_counts()\n",
    "train_engineer = features_engineering(df_train)\n",
    "train_engineer.loc[(train_engineer['site_id']==0) & (train_engineer['meter']==0),'meter_reading']\\\n",
    "=train_engineer.loc[(train_engineer['site_id']==0) & (train_engineer['meter']==0),'meter_reading'].mul(0.2931)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [],
   "source": [
    "total_list = ['building_id', 'meter','site_id','primary_use', 'square_feet','air_temperature',\\\n",
    "                    'cloud_coverage','dew_temperature','precip_depth_1_hr','hour', 'weekend','is_holiday']"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [],
   "source": [
    "X_half_1 = train_engineer.loc[train_engineer.group==1][total_list]\n",
    "X_half_2 = train_engineer.loc[train_engineer.group==2][total_list]\n",
    "y_half_1 = np.log1p(train_engineer.loc[train_engineer.group==1]['meter_reading'])\n",
    "y_half_2 = np.log1p(train_engineer.loc[train_engineer.group==2]['meter_reading'])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {},
   "outputs": [],
   "source": [
    "categoricals = [\"building_id\", \"site_id\", \"meter\", \"primary_use\",  \"weekend\",'is_holiday']\n",
    "d_half_1 = lgb.Dataset(X_half_1, label=y_half_1, categorical_feature=categoricals, free_raw_data=False)\n",
    "d_half_2 = lgb.Dataset(X_half_2, label=y_half_2, categorical_feature=categoricals, free_raw_data=False)\n",
    "watchlist_1 = [d_half_2, d_half_1]\n",
    "watchlist_2 = [d_half_1, d_half_2]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {},
   "outputs": [],
   "source": [
    "params1 = {\n",
    "    \"objective\": \"regression\",\n",
    "    'bagging_fraction':0.9,\n",
    "    \"boosting\": \"gbdt\",\n",
    "    \"num_leaves\": 3000,\n",
    "    \"learning_rate\": 0.05,\n",
    "    \"feature_fraction\": 0.9,\n",
    "    \"reg_lambda\":3.0 ,\n",
    "    'reg_alpha': 0.1 ,\n",
    "    'max_depth':12,\n",
    "    'min_child_weight':  20,\n",
    "    'min_data_in_leaf':20,\n",
    "    \"metric\": \"rmse\"\n",
    "}\n",
    "params2 = {\n",
    "    \"objective\": \"regression\",\n",
    "    'bagging_fraction': 0.1,\n",
    "    \"boosting\": \"gbdt\",\n",
    "    \"num_leaves\": 1000,\n",
    "    \"learning_rate\": 0.05,\n",
    "    \"feature_fraction\":  0.9,\n",
    "    \"reg_lambda\": 0.1,\n",
    "    'reg_alpha':3.0 ,\n",
    "    'max_depth':12,\n",
    "    'min_child_weight':3,\n",
    "    'min_data_in_leaf':200,\n",
    "    \"metric\": \"rmse\"\n",
    "}"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Building model with first half and validating on second half:\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/local/home/ningzesun/.local/lib/python3.6/site-packages/lightgbm/basic.py:1243: UserWarning: Using categorical_feature in Dataset.\n",
      "  warnings.warn('Using categorical_feature in Dataset.')\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Training until validation scores don't improve for 200 rounds\n",
      "[200]\ttraining's rmse: 0.598892\tvalid_0's rmse: 0.866394\n",
      "[400]\ttraining's rmse: 0.55618\tvalid_0's rmse: 0.867651\n",
      "Early stopping, best iteration is:\n",
      "[256]\ttraining's rmse: 0.583824\tvalid_0's rmse: 0.86591\n",
      "Building model with second half and validating on first half:\n",
      "Training until validation scores don't improve for 200 rounds\n",
      "[200]\ttraining's rmse: 0.649583\tvalid_0's rmse: 0.836275\n",
      "Early stopping, best iteration is:\n",
      "[184]\ttraining's rmse: 0.654797\tvalid_0's rmse: 0.836056\n"
     ]
    }
   ],
   "source": [
    "print(\"Building model with first half and validating on second half:\")\n",
    "model_half_1 = lgb.train(params1, train_set=d_half_1, num_boost_round=10000, valid_sets=watchlist_1, verbose_eval=200, early_stopping_rounds=200)\n",
    "\n",
    "print(\"Building model with second half and validating on first half:\")\n",
    "model_half_2 = lgb.train(params2, train_set=d_half_2, num_boost_round=10000, valid_sets=watchlist_2, verbose_eval=200, early_stopping_rounds=200)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "metadata": {},
   "outputs": [],
   "source": [
    "test_feature = pd.read_csv('../../Large_output/test_merge.csv')\n",
    "test_feature = features_engineering(test_feature)\n",
    "row_ids = test_feature[['row_id']]\n",
    "test_feature = test_feature[['building_id', 'meter','site_id','primary_use', 'square_feet','air_temperature',\\\n",
    "                    'cloud_coverage','dew_temperature','precip_depth_1_hr','hour', 'weekend','is_holiday']]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "100%|██████████| 834/834 [06:22<00:00,  2.18it/s]\n"
     ]
    }
   ],
   "source": [
    "models = [model_half_1,model_half_2]\n",
    "from tqdm import tqdm\n",
    "i=0\n",
    "res=[]\n",
    "step_size = 50000\n",
    "for j in tqdm(range(int(np.ceil(test_feature.shape[0]/50000)))):\n",
    "    res.append(sum(np.expm1([model.predict(test_feature.iloc[i:i+step_size]) for model in models])/len(models)))\n",
    "    i+=step_size"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "metadata": {},
   "outputs": [],
   "source": [
    "res = np.concatenate(res)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>row_id</th>\n",
       "      <th>meter_reading</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>0</td>\n",
       "      <td>105.533187</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>1</td>\n",
       "      <td>68.027624</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>2</td>\n",
       "      <td>10.686327</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>3</td>\n",
       "      <td>188.630619</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>4</td>\n",
       "      <td>1102.553080</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>...</th>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>41697595</th>\n",
       "      <td>41697595</td>\n",
       "      <td>6.004035</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>41697596</th>\n",
       "      <td>41697596</td>\n",
       "      <td>4.176343</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>41697597</th>\n",
       "      <td>41697597</td>\n",
       "      <td>7.992368</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>41697598</th>\n",
       "      <td>41697598</td>\n",
       "      <td>178.120886</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>41697599</th>\n",
       "      <td>41697599</td>\n",
       "      <td>9.452605</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>41697600 rows × 2 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "            row_id  meter_reading\n",
       "0                0     105.533187\n",
       "1                1      68.027624\n",
       "2                2      10.686327\n",
       "3                3     188.630619\n",
       "4                4    1102.553080\n",
       "...            ...            ...\n",
       "41697595  41697595       6.004035\n",
       "41697596  41697596       4.176343\n",
       "41697597  41697597       7.992368\n",
       "41697598  41697598     178.120886\n",
       "41697599  41697599       9.452605\n",
       "\n",
       "[41697600 rows x 2 columns]"
      ]
     },
     "execution_count": 18,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "test_feature['meter_reading']=np.clip(res, 0, a_max=None)\n",
    "test_feature.loc[(test_feature['site_id']==0) & (test_feature['meter']==0),'meter_reading']\\\n",
    "=test_feature.loc[(test_feature['site_id']==0) & (test_feature['meter']==0),'meter_reading'].mul(3.4118)\n",
    "df_result = pd.DataFrame({'row_id': row_ids['row_id'], 'meter_reading': test_feature['meter_reading']})\n",
    "df_result"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 19,
   "metadata": {},
   "outputs": [],
   "source": [
    "df_result.to_csv('../../Large_output/lgb_half_clean.csv',index = False)\n",
    "# 1.089"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.7.4"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
